library(dplyr)
library(texreg)

# LOAD DATA
db = src_sqlite("C:\\Users\\Daniel\\Dropbox\\Journal (II)\\Replication Files\\Arel-Bundock, Vincent Replication\\replication\\replication_20160118.sqlite")
library(dplyr)
cmd = 'SELECT sample.iso2c_sub, sample.id_guo, guo.iso2c_guo, sample.sub_dummy_host, predictors.*, distance.distw, wdi.*
       FROM sample
       LEFT JOIN predictors ON sample.iso2c_sub=predictors.iso2c
       LEFT JOIN wdi ON sample.iso2c_sub=wdi.iso2c
       LEFT JOIN guo ON sample.id_guo=guo.id_guo
       LEFT JOIN distance ON distance.iso2c_guo=guo.iso2c_guo AND distance.iso2c_sub=sample.iso2c_sub'
dat = tbl(db, sql(cmd)) %>% collect %>% data.frame %>%
      mutate(gdp = log(gdp),
             gdppc = log(gdppc),
             distw = distw / 1000)

# TABLE 3 
is = data.frame(collect(tbl(db, 'investment_shares')))
income = is %>% group_by(income) %>%
         select(matches('orbis|fdi')) %>%
         summarise_each(funs(sum)) %>%
         dplyr::select(income, orbis_inflow, fdi_in, orbis_outflow, fdi_out)
income = data.frame(income[c(1, 4:2),])
write.csv(income, file='tables/table_3.csv')

# TABLE 4
estimate = function(controls, political='p_polity2'){ 
    f_apolitical = controls
    f_political = update(f_apolitical, paste('.~.+', political))
    vars = all.vars(f_political)
    dat_tmp = na.omit(dat[, vars])
    set.seed(1024)
    idx_train = sample(1:nrow(dat_tmp), nrow(dat_tmp)/2, replace=FALSE)
    idx_test = 1:nrow(dat_tmp)
    idx_test = idx_test[!idx_test %in% idx_train]
    dat_train = dat_tmp[idx_train,]
    dat_test = dat_tmp[idx_test,]
    score = function(mod){
        pred = predict(mod, newdata=dat_test, type='response')
        pred = ifelse(pred >= .5, 1, 0)
        pred = (mean(pred == dat_test$sub_dummy_host) * 100) %>% round(1)
        return(pred)
    }
    mod_apolitical = glm(f_apolitical, data=dat_train, family=binomial())
    mod_political = glm(f_political, data=dat_train, family=binomial())
    mod_apolitical$score = score(mod_apolitical)
    mod_political$score = score(mod_political)
    out = list(mod_apolitical, mod_political)
    return(out)
}

table_4 = estimate(sub_dummy_host ~ gdp + distw, 'p_polity2')
table_4_scores = sapply(table_4, function(x) x$score)
screenreg(table_4, digits=4, stars=NULL, file='tables/table_4.txt')
cat(table_4_scores, file='tables/table_4_scores.txt')

# TABLE 24
vars = grep('prs', colnames(dat), value=TRUE)
models = lapply(vars, function(x) estimate(sub_dummy_host ~ gdp + distw, x))
scores = sapply(models, function(x) x[[2]]$score)
scores = c(models[[1]][[1]]$score, scores)
thin = data.frame('Model'=c('Apolitical', vars), 'Thin'=scores)
vars = grep('prs', colnames(dat), value=TRUE)
models = lapply(vars, function(x) estimate(sub_dummy_host ~ gdp + distw + pop_urban + telephones + gdppc, x))
scores = sapply(models, function(x) x[[2]]$score)
scores = c(models[[1]][[1]]$score, scores)
thick = data.frame('Model'=c('Apolitical', vars), 'Thick'=scores)
table_24 = merge(thin, thick)
write.csv(table_24, file='tables/table_24.txt', row.names=FALSE)
